1 Info

Here I train the 16x video classifiers with the merged data, i.e. using stock culture data from every time point and light setting recorded.

2 Dependencies

library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)
library(parallel)
library(doParallel)

set.seed(1)

3 Data

3.1 Normal light

3.1.1 1st data (2020-2021)

colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
              "Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
              "Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
              "Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
              "Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
              "Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
              "Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
              "Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
              "Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
              "Image_Width","Image_X","Image_Y","Intensity","Length",
              "Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
              "Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
              "Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
              "Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")

dd_all <- read_csv("../Class_18C_normalLight/1st_class_2020/Data/libraries_FC100/dd_all.csv", show_col_types = FALSE)
# dd_all$species <- ifelse(dd_all$species=="small_unidentified_possibly_small_digested_algae", "Small_unidentified",dd_all$species)

# table(dd_all$species)

3.1.2 2nd data (2022 Feb)

Chlamydomonas <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Chlamydomonas_light_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Cosmarium_light_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Desmodesmus <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Desmodesmus_light_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Monoraphidium_light_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Staurastrum1_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Staurastrum2_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_feb22 <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
                      Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_feb22) <- colnames
  
# table(dd_all_feb22$species)

3.1.3 3rd data (20220316)

Chlamydomonas <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Chlamydomonas_light_March_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Cosmarium_light_March_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Cryptomonas_light_March_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Desmodesmus_light_March_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Monoraphidium_light_March_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Staurastrum1_light_March_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Staurastrum2_light_March_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220316 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                      Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_20220316) <- colnames
  
# table(dd_all_20220316$species)

3.1.4 4th data (20220406)

Chlamydomonas <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Chlamydomonas_light_April_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Cosmarium_light_April_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Cryptomonas_light_April_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Desmodesmus_light_April_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Monoraphidium_light_April_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Staurastrum1_light_April_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Staurastrum2_light_April_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220406 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                      Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_20220406) <- colnames
  
# table(dd_all_20220406$species)

3.1.5 5th data (20220710)

Chlamydomonas <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Chlamydomonas_light_July_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Cosmarium_light_July_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Cryptomonas_light_July_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Desmodesmus_light_July_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Monoraphidium_light_July_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

# Staurastrum1 <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Staurastrum2_light_July_2022.csv", show_col_types = FALSE)
# Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Staurastrum2_light_July_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220710 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                      Monoraphidium,
                      # Staurastrum1,
                      Staurastrum2) 

colnames(dd_all_20220710) <- colnames
  
# table(dd_all_20220710$species)

3.2 Decreasing light

3.2.1 2nd data (2022 Feb) - 18 percent

Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Chlamydomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Cosmarium_dark_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Desmodesmus_dark_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Monoraphidium_dark_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Staurastrum1_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Staurastrum2_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Cryptomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

dd_all_feb22_dark <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
                           Monoraphidium,Staurastrum1,Staurastrum2,
                           Cryptomonas) 

colnames(dd_all_feb22_dark) <- colnames
  
# table(dd_all_feb22_dark$species)

3.2.2 3rd data (20220316) - 6 percent

Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Chlamydomonas_dark_March_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Cosmarium_dark_March_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Cryptomonas_dark_March_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Desmodesmus_dark_March_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Monoraphidium_dark_March_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Staurastrum1_dark_March_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Staurastrum2_dark_March_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220316_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                              Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_20220316_dark) <- colnames
  
# table(dd_all_20220316_dark$species)

rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, Staurastrum1, Staurastrum2)

3.2.3 4th data (20220406) - 1 percent

Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Chlamydomonas_dark_April_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Cosmarium_dark_April_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Cryptomonas_dark_April_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Desmodesmus_dark_April_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Monoraphidium_dark_April_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Staurastrum1_dark_April_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Staurastrum2_dark_April_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220406_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                              Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_20220406_dark) <- colnames
  
# table(dd_all_20220406_dark$species)

rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, Staurastrum1, Staurastrum2)

3.2.4 5th data (20220710) - 1 percent

Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Chlamydomonas_dark_July_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Cosmarium_dark_July_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Cryptomonas_dark_July_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Desmodesmus_dark_July_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Monoraphidium_dark_July_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

# Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Staurastrum1_dark", show_col_types = FALSE)
# Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Staurastrum2_dark_July_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220710_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                              Monoraphidium,
                              # Staurastrum1,
                              Staurastrum2) 

colnames(dd_all_20220710_dark) <- colnames
  
# table(dd_all_20220710_dark$species)

rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, 
   Staurastrum1,
   Staurastrum2)

4 Further data Processing

4.1 Merge data

dd_all$data <- "Late 2020"
dd_all_feb22$data <- "20220201"
dd_all_20220316$data <- "20220316"
dd_all_20220406$data <- "20220406"
dd_all_20220710$data <- "20220710"
dd <- rbind(dd_all,dd_all_feb22,dd_all_20220316,dd_all_20220406, dd_all_20220710)

dd_all_feb22_dark$data <- "20220201"
dd_all_20220316_dark$data <- "20220316"
dd_all_20220406_dark$data <- "20220406"
dd_all_20220710_dark$data <- "20220710"

dd$light <- "30%"
dd_all_feb22_dark$light <- "18%"
dd_all_20220316_dark$light <- "6%"
dd_all_20220406_dark$light <- "1%"
dd_all_20220710_dark$light <- "1%"
dd <- rbind(dd,dd_all_feb22_dark,dd_all_20220316_dark,dd_all_20220406_dark,dd_all_20220710_dark)
# table(dd$species, dd$data, dd$light)

4.2 filtering out outliers

If an individual is an outlier in more than 4 traits it gets excluded (about 6% gets excluded). Outlier based on boxplot definition.

dd$id <- 1:nrow(dd)
dd_long <- dd %>%
  dplyr::select(species, Area_ABD,Area_Filled,Aspect_Ratio,Average_Blue,
                Average_Green,Average_Red,Circle_Fit,Circularity,
                Compactness,Convexity,Diameter_ABD,Diameter_ESD,Edge_Gradient,
                Elongation,Feret_Angle_Max,Feret_Angle_Min,Fiber_Curl,
                Fiber_Straightness,Geodesic_Aspect_Ratio,Intensity,
                Length,Perimeter,Ratio_Blue_Green,Ratio_Red_Blue,
                Ratio_Red_Green,Roughness,Sigma_Intensity,Sum_Intensity,
                Symmetry,Transparency,Width, id, data, light) %>%
  pivot_longer(cols=-c(id,species, data, light), names_to="variable") %>%
  dplyr::group_by(variable,species,data,light) %>%
  mutate(iqr = IQR(value),
         first_quartile = quantile(value, probs = 0.25),
         third_quartile = quantile(value, probs = 0.75),
         outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))

outliers <- dd_long %>%
  dplyr::filter(outlier==T) %>%
  dplyr::group_by(species, id,data,light) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n>4)
## `summarise()` has grouped output by 'species', 'id', 'data'. You can override using the `.groups` argument.
# table(outliers$species)
# nrow(outliers)/nrow(dd)

dd_filtered <- dd %>%
  dplyr::filter(!is.element(id,outliers$id))

dd$removed_outliers <- F
dd_filtered$removed_outliers <- T

dd_comparison <- rbind(dd,dd_filtered)

# dd_comparison %>%
#   ggplot(aes(log10(Area_ABD), col=removed_outliers))+
#   geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
#   geom_boxplot(outlier.alpha = 0.3) +
#   theme_bw() +
#   facet_wrap(species~interaction(light,data), scales = "free_y")  +
#   geom_vline(xintercept = 1, col="black")
# 
# dd_filtered %>%
#   ggplot(aes(log10(Area_ABD)))+
#   geom_density(aes(col=species))

4.3 Sub-sampling to create training data

We have data from several time points and light settings. The goals is that in the final training data these are somewhat equally represented, regardless that the number of individuals tracked per species and time point can vary a lot (in other words: “even it out” across species, time point and light setting). This is important, as otherwise it can be that groups that are more abundant are weighted more.

dd <- dd %>%
  mutate(data.light = interaction(data,light, drop = T),
         data.light.species=interaction(data,light,species, drop = T))

print("number of individuals per species per date and light setting")
## [1] "number of individuals per species per date and light setting"
tab <- table(dd$data.light, dd$species)
tab
##                
##                 airbubbles Chlamydomonas ChlamydomonasClumps Coleps_irchel
##   20220406.1%            0          2035                   0             0
##   20220710.1%            0          3449                   0             0
##   20220201.18%           0          3023                   0             0
##   20220201.30%           0          6004                   0             0
##   20220316.30%           0          6463                   0             0
##   20220406.30%           0          4391                   0             0
##   20220710.30%           0          3240                   0             0
##   Late 2020.30%        112          1099                 359           475
##   20220316.6%            0          4410                   0             0
##                
##                 Colpidium ColpidiumVacuoles Cosmarium Cryptomonas Debris
##   20220406.1%           0                 0        93        2162      0
##   20220710.1%           0                 0       135        8143      0
##   20220201.18%          0                 0       176        2987      0
##   20220201.30%          0                 0      1103           0      0
##   20220316.30%          0                 0       328        3334      0
##   20220406.30%          0                 0       429        2901      0
##   20220710.30%          0                 0        34        3728      0
##   Late 2020.30%       101               264       341         781    339
##   20220316.6%           0                 0       164        2239      0
##                
##                 Desmodesmus Dexiostoma DigestedAlgae DividingChlamydomonas
##   20220406.1%           874          0             0                     0
##   20220710.1%          1088          0             0                     0
##   20220201.18%         1113          0             0                     0
##   20220201.30%         5122          0             0                     0
##   20220316.30%         4068          0             0                     0
##   20220406.30%         3655          0             0                     0
##   20220710.30%         1060          0             0                     0
##   Late 2020.30%        1051        160           154                   398
##   20220316.6%          2573          0             0                     0
##                
##                 Loxocephallus Monoraphidium OtherCiliate Small_unidentified
##   20220406.1%               0           578            0                  0
##   20220710.1%               0           665            0                  0
##   20220201.18%              0          1916            0                  0
##   20220201.30%              0           746            0                  0
##   20220316.30%              0          1062            0                  0
##   20220406.30%              0          1026            0                  0
##   20220710.30%              0           638            0                  0
##   Late 2020.30%           224          1489          164               4642
##   20220316.6%               0           855            0                  0
##                
##                 Staurastrum1 Staurastrum2 Tetrahymena
##   20220406.1%            212           91           0
##   20220710.1%              0           15           0
##   20220201.18%           111          124           0
##   20220201.30%           589          126           0
##   20220316.30%           244           52           0
##   20220406.30%           604           89           0
##   20220710.30%             0           35           0
##   Late 2020.30%          510           75         108
##   20220316.6%            225           94           0
print("Sum of individuals per species")
## [1] "Sum of individuals per species"
colsums <- colSums(tab)
colsums
##            airbubbles         Chlamydomonas   ChlamydomonasClumps 
##                   112                 34114                   359 
##         Coleps_irchel             Colpidium     ColpidiumVacuoles 
##                   475                   101                   264 
##             Cosmarium           Cryptomonas                Debris 
##                  2803                 26275                   339 
##           Desmodesmus            Dexiostoma         DigestedAlgae 
##                 20604                   160                   154 
## DividingChlamydomonas         Loxocephallus         Monoraphidium 
##                   398                   224                  8975 
##          OtherCiliate    Small_unidentified          Staurastrum1 
##                   164                  4642                  2495 
##          Staurastrum2           Tetrahymena 
##                   701                   108

Clearly, the amount of individuals per class varies a lot… I think the most limiting factor is Staurastrum2, i.e. an algae we are interested in but of which only have rather few individuals. For now at least I try to take 2 * Staurastrum2 as the number of individuals per species to be included (if a species/class has less than that all individuals are included).

Within a class I use that number so that roughly equally many individuals across time steps and light settings are included. As a last thing I divide the data into a training dataset and in a test dataset.

n <- colsums["Staurastrum2"]*3
print(paste0("Max number of individuals used per species (if there are fewer for a species, then all are used): n=",n))
## [1] "Max number of individuals used per species (if there are fewer for a species, then all are used): n=2103"
num_ind_per_class <- dd %>% group_by(species) %>% 
  summarize(num_class = length(unique(data.light.species)),
            num_ind_per_class = floor(n/num_class)) %>%
  select(species,num_ind_per_class)

num_ind_per_class_vec <- c(num_ind_per_class$num_ind_per_class)
names(num_ind_per_class_vec) <- num_ind_per_class$species

set.seed(53)

split_up <- split(dd, f = dd$data.light.species)
split_up <- lapply(split_up, function(x) {
  species <- unique(x$species)
  x %>% sample_n(ifelse(nrow(x) < num_ind_per_class_vec[species], nrow(x), num_ind_per_class_vec[species]))})
trainingdata <- do.call("rbind", split_up)

print("Subsampled: number of individuals per species per date and light setting")
## [1] "Subsampled: number of individuals per species per date and light setting"
tab2 <- table(trainingdata$data.light, trainingdata$species)
tab2
##                
##                 airbubbles Chlamydomonas ChlamydomonasClumps Coleps_irchel
##   20220406.1%            0           233                   0             0
##   20220710.1%            0           233                   0             0
##   20220201.18%           0           233                   0             0
##   20220201.30%           0           233                   0             0
##   20220316.30%           0           233                   0             0
##   20220406.30%           0           233                   0             0
##   20220710.30%           0           233                   0             0
##   Late 2020.30%        112           233                 359           475
##   20220316.6%            0           233                   0             0
##                
##                 Colpidium ColpidiumVacuoles Cosmarium Cryptomonas Debris
##   20220406.1%           0                 0        93         262      0
##   20220710.1%           0                 0       135         262      0
##   20220201.18%          0                 0       176         262      0
##   20220201.30%          0                 0       233           0      0
##   20220316.30%          0                 0       233         262      0
##   20220406.30%          0                 0       233         262      0
##   20220710.30%          0                 0        34         262      0
##   Late 2020.30%       101               264       233         262    339
##   20220316.6%           0                 0       164         262      0
##                
##                 Desmodesmus Dexiostoma DigestedAlgae DividingChlamydomonas
##   20220406.1%           233          0             0                     0
##   20220710.1%           233          0             0                     0
##   20220201.18%          233          0             0                     0
##   20220201.30%          233          0             0                     0
##   20220316.30%          233          0             0                     0
##   20220406.30%          233          0             0                     0
##   20220710.30%          233          0             0                     0
##   Late 2020.30%         233        160           154                   398
##   20220316.6%           233          0             0                     0
##                
##                 Loxocephallus Monoraphidium OtherCiliate Small_unidentified
##   20220406.1%               0           233            0                  0
##   20220710.1%               0           233            0                  0
##   20220201.18%              0           233            0                  0
##   20220201.30%              0           233            0                  0
##   20220316.30%              0           233            0                  0
##   20220406.30%              0           233            0                  0
##   20220710.30%              0           233            0                  0
##   Late 2020.30%           224           233          164               2103
##   20220316.6%               0           233            0                  0
##                
##                 Staurastrum1 Staurastrum2 Tetrahymena
##   20220406.1%            212           91           0
##   20220710.1%              0           15           0
##   20220201.18%           111          124           0
##   20220201.30%           300          126           0
##   20220316.30%           244           52           0
##   20220406.30%           300           89           0
##   20220710.30%             0           35           0
##   Late 2020.30%          300           75         108
##   20220316.6%            225           94           0
print("Subsampled: Sum of individuals per species")
## [1] "Subsampled: Sum of individuals per species"
colsums2 <- colSums(tab2)
colsums2
##            airbubbles         Chlamydomonas   ChlamydomonasClumps 
##                   112                  2097                   359 
##         Coleps_irchel             Colpidium     ColpidiumVacuoles 
##                   475                   101                   264 
##             Cosmarium           Cryptomonas                Debris 
##                  1534                  2096                   339 
##           Desmodesmus            Dexiostoma         DigestedAlgae 
##                  2097                   160                   154 
## DividingChlamydomonas         Loxocephallus         Monoraphidium 
##                   398                   224                  2097 
##          OtherCiliate    Small_unidentified          Staurastrum1 
##                   164                  2103                  1692 
##          Staurastrum2           Tetrahymena 
##                   701                   108
trainingdata <- trainingdata[complete.cases(trainingdata),]

# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
                                 p = 0.65, # percentage of data as training
                                 list = FALSE)

testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]

5 Loading in species compositions

species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                                         "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))]
compositions <- read_csv("../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- comp_name

6 Classifiers

6.1 Classifier plotting functions

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}

6.2 Training of classifiers

formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
      Average_Blue + Average_Green + Average_Red + Circle_Fit +
      Circularity + Compactness +
      Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
      Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
      Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
      Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
      Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
      Width

set.seed(8765)
# classifiers_18c <- list()
# classifiers_18c_plots <- list()
# classifiers_18c_assess_plots <- list()


completeList <- mclapply(1:length(compositions_list), function(i){
  
  if("Colpidium" %in% compositions_list[[i]]) {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                            "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
    sub_testdata <- testdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                            "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
  } else {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
                            "DigestedAlgae","DividingChlamydomonas"))
    sub_testdata <- testdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
                            "DigestedAlgae","DividingChlamydomonas"))
  }

  sub_trainingdata$species <- factor(sub_trainingdata$species)
  sub_testdata$species <- factor(sub_testdata$species)
  
  # split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  # subsamples <- lapply(split_up, function(x) {
  #   x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  # sub_trainingdata <- do.call("rbind", subsamples)
  
  # # A stratified random split of the data
  # idx_train <- createDataPartition(sub_trainingdata$species,
  #                                  p = 0.7, # percentage of data as training
  #                                  list = FALSE)
  # 
  # sub_testdata <- sub_trainingdata[-idx_train,]
  # sub_trainingdata <- sub_trainingdata[idx_train,]
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c <- obj$best.model

  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                               sub_testdata$species)
  
  sub_testdata <- sub_testdata %>%
    dplyr::mutate(predicted = predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                  correct = species==predicted)
  
  sub_testdata_summ <- sub_testdata %>% 
    group_by(data, species, light, correct) %>%
    summarize(n = n()) %>%
    dplyr::mutate(perc_corr = n/sum(n),
                  composition = names(compositions_list)[i]) %>%
    dplyr::filter(correct==T)
  
  classified_test_data <- sub_testdata_summ
  
  classifiers_18c_plots <- classifier_plot_svm(table = cf$table,
                                               combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_assess_plots <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
  
  list(classifiers_18c, classified_test_data, classifiers_18c_plots, classifiers_18c_assess_plots)
}, mc.cores = detectCores()-3)

classifiers_18c <- map(completeList, 1)
classified_test_data <- map(completeList, 2)
classifiers_18c_plots <- map(completeList, 3)
classifiers_18c_assess_plots <- map(completeList, 4)

classified_test_data <- do.call("rbind", classified_test_data) 

names(classifiers_18c_plots) <- names(classifiers_18c) <-
  names(classifiers_18c_assess_plots) <- comp_name

6.3 Assessing of classifiers

There are mainly 4 measures:

  • Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)

  • Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)

  • Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)

  • Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)

PPV is the most important for us!

library(patchwork)
for(i in 1:length(classifiers_18c_plots)){
  print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] + 
    plot_layout(widths = c(7,3)))
}

classified_test_data <- classified_test_data%>%
  dplyr::mutate(data=ifelse(data=="Late 2020","2020late",data),
                tot_n = n/perc_corr, data=factor(data, ordered = T,
                                                 levels=c("2020late","20220201","20220301","20220316","20220406","20220706")),
                light_num = as.numeric(gsub("%","",light)))

# classified_test_data %>%
#   dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
#   ggplot(aes(data, perc_corr, col=composition, shape=light, size=tot_n)) +
#   geom_hline(yintercept = 0.8, col="red") +
#   geom_hline(yintercept = 0.9, col="green")+
#   geom_point(position = position_jitter(0.2))+
#   facet_wrap(~species, nrow = 3)+
#   theme_bw() +
#   scale_size_continuous(breaks = c(1,10,20,30)) +
#   labs(size="Sample size of evaluation data", y="Correct identification in percentage", x="Date of data collection",
#        title="Species identification across date, light intensities and compositions")

classified_test_data %>%
  dplyr::filter(species %in% c("Chlamydomonas", "Cosmarium", "Desmodesmus", "Monoraphidium", "Staurastrum1", "Staurastrum2", "Cryptomonas")) %>%
  ggplot(aes(light_num, perc_corr))+
  geom_hline(yintercept = 0.8, col="red") +
  geom_point(aes(fill=composition,size=tot_n),shape=21,col="black",alpha=0.3)+
  facet_wrap(~species, nrow = 3)+
  scale_size_continuous(breaks = c(1,10,20,30)) +
  geom_smooth(method = "lm")+
  theme_bw() +
  labs(size="Sample size of evaluation data", y="Correct species identification in percentage", x="Light intensity in percentage",
       title="Species identification across light intensities and compositions")
## `geom_smooth()` using formula 'y ~ x'

# classified_test_data %>%
#   dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
#   ggplot(aes(tot_n,perc_corr, col=species))+
#   geom_point()+
#   theme_bw()

7 Save classifiers

saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c_20220710_MergedData.rds")
# labels <- dd_all %>% 
#   group_by(species) %>%  
#   summarise(xPos = max(Area_Filled),
#             yPos = max((density(Area_Filled))$y))
# 
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))